# !pip install plotly-express
import pandas as pd
import numpy as np
from datetime import date
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
from matplotlib.colors import ListedColormap
from matplotlib import pyplot as plt
import plotly.express as px
%matplotlib inline
# Load the raw Udemy course table from the CSV that sits next to the notebook.
df=pd.read_csv('UDEMY_DATA.csv')
# A bare name as the last statement makes the notebook render the frame.
df
| best_seller | course_rating_avarage | course_price | num_of_buyers_students | amount_of_instructor_studnets | num_articles | video_time_length | number_of_languages | last_update | instructor_rank | amount_of_what_you_will_learn_count | amount_of_requirments_count | amount_of_companies_support | course_rating_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 4.7 | 69.90 | 493,315 | 1,415,946 | 230 | 60 | 13 | 12.2021 | 4.7 | 9 | 8 | 5 | 108,821 |
| 1 | 0 | 4.6 | 69.90 | 1,533,407 | 2,853,435 | 14 | 22 | 9 | 3.2021 | 4.6 | 12 | 6 | 5 | 418,238 |
| 2 | 0 | 4.7 | 89.90 | 150,857 | 201,184 Reviews | 56 | 30.5 | 8 | 5.2022 | 4.7 | 18 | 7 | 5 | 36,525 |
| 3 | 0 | 4.6 | 89.90 | 375,779 | 53 Courses | 12 | 70 | 8 | 3.2022 | 4.5 | 7 | 7 | 5 | 90,083 |
| 4 | 1 | 4.6 | 59.90 | 542,316 | 2,853,435 | 13 | 25 | 9 | 5.2020 | 4.6 | 16 | 6 | 5 | 115,128 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26065 | 0 | 4.5 | 59.90 | 11 | 112 | -1 | 1 hour on-demand video | 2 | 7.2021 | 4.0 | 6 | 4 | 0 | 1 rating) |
| 26066 | 0 | 3.7 | 59.90 | 68 | 6,057 | 1 | 19.5 | 2 | 5.2022 | 3.4 | 23 | 6 | 0 | 14 |
| 26067 | 0 | 3.9 | 699.90 | 15 | 40 | -1 | 17.5 | 2 | 6.2021 | 3.6 | 8 | 5 | 0 | 2 |
| 26068 | 0 | 4.4 | 59.90 | 3,063 | 6,956 | -1 | 3 | 2 | 4.2018 | 4.1 | 35 | 6 | 0 | 164 |
| 26069 | 0 | 4.6 | 59.90 | 596 | 596 | 1 | 10 | 2 | 1.2021 | 4.6 | 10 | 6 | 0 | 128 |
26070 rows × 14 columns
Markdown:
# Discard every course row that carries the value -1 in any tracked column
# (presumably the placeholder for a missing value - confirm with the data
# source), then renumber the surviving rows from zero.
columns = ["best_seller", "course_rating_avarage", "course_price", "num_of_buyers_students", "amount_of_instructor_studnets", "num_articles", "video_time_length", "number_of_languages", "last_update", "instructor_rank", "amount_of_what_you_will_learn_count", "amount_of_requirments_count", "amount_of_companies_support", "course_rating_amount"]
has_placeholder = (df[columns] == -1).any(axis=1)
df = df.loc[~has_placeholder].reset_index(drop=True)
df
| best_seller | course_rating_avarage | course_price | num_of_buyers_students | amount_of_instructor_studnets | num_articles | video_time_length | number_of_languages | last_update | instructor_rank | amount_of_what_you_will_learn_count | amount_of_requirments_count | amount_of_companies_support | course_rating_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 4.7 | 69.90 | 493,315 | 1,415,946 | 230 | 60 | 13 | 12.2021 | 4.7 | 9 | 8 | 5 | 108,821 |
| 1 | 0 | 4.6 | 69.90 | 1,533,407 | 2,853,435 | 14 | 22 | 9 | 3.2021 | 4.6 | 12 | 6 | 5 | 418,238 |
| 2 | 0 | 4.7 | 89.90 | 150,857 | 201,184 Reviews | 56 | 30.5 | 8 | 5.2022 | 4.7 | 18 | 7 | 5 | 36,525 |
| 3 | 0 | 4.6 | 89.90 | 375,779 | 53 Courses | 12 | 70 | 8 | 3.2022 | 4.5 | 7 | 7 | 5 | 90,083 |
| 4 | 1 | 4.6 | 59.90 | 542,316 | 2,853,435 | 13 | 25 | 9 | 5.2020 | 4.6 | 16 | 6 | 5 | 115,128 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10881 | 0 | 4.3 | 59.90 | 3,754 | 40,786 | 1 | 5 | 2 | 1.2022 | 4.4 | 7 | 6 | 5 | 609 |
| 10882 | 0 | 4.6 | 59.90 | 16,937 | 68,564 | 2 | 24 | 3 | 4.2019 | 4.6 | 4 | 7 | 5 | 2,138 |
| 10883 | 0 | 4.5 | 59.90 | 1,963 | 848,521 | 14 | 1.5 | 2 | 5.2020 | 4.4 | 6 | 5 | 0 | 2 |
| 10884 | 0 | 3.7 | 59.90 | 68 | 6,057 | 1 | 19.5 | 2 | 5.2022 | 3.4 | 23 | 6 | 0 | 14 |
| 10885 | 0 | 4.6 | 59.90 | 596 | 596 | 1 | 10 | 2 | 1.2021 | 4.6 | 10 | 6 | 0 | 128 |
10886 rows × 14 columns
Markdown:
# Turn the instructor-student column from text into nullable integers.
# The fragments are removed in this exact order: plural suffixes go before
# their singular forms so that no stray "s" is left behind.
instructor_students = df['amount_of_instructor_studnets']
for fragment in (" Reviews", " Courses", " Course", " Review",
                 "1Student", "1 Student", ",", "--"):
    instructor_students = instructor_students.str.replace(fragment, "")
# Cells that end up empty become missing values, hence the nullable Int64.
instructor_students = pd.to_numeric(instructor_students, downcast='integer')
df["amount_of_instructor_studnets"] = instructor_students.astype('Int64')
df['amount_of_instructor_studnets']
0 1415946
1 2853435
2 201184
3 53
4 2853435
...
10881 40786
10882 68564
10883 848521
10884 6057
10885 596
Name: amount_of_instructor_studnets, Length: 10886, dtype: Int64
# Convert "last_update" from "<month>.<year>" text (optionally prefixed with
# "Published ") into an approximate age in days relative to today.
#
# The previous version called .drop(..., inplace=True) on the column Series,
# which never removed the malformed rows from `df`, and used .str.index("."),
# which raises as soon as one value has no dot. Here the rows are validated
# with a pattern and the invalid ones are dropped from the frame itself.
cleaned_update = df["last_update"].str.replace("Published ", "", regex=False)
# Group 0 is the month, group 1 the 4-digit year; non-matching rows give NaN.
update_parts = cleaned_update.str.extract(r"^\s*(\d{1,2})\.(\d{4})\s*$")
is_valid_update = update_parts.notna().all(axis=1)
df = df.loc[is_valid_update].reset_index(drop=True)
update_parts = update_parts.loc[is_valid_update].reset_index(drop=True).astype("int64")

# Read the clock once so every row is measured against the same date.
# NOTE: the result depends on the day the notebook is executed.
today = date.today()
month_diffrence = (today.month - update_parts[0]) * 30
year_diffrence = (today.year - update_parts[1]) * 365
df["last_update"] = month_diffrence + year_diffrence
df["last_update"]
0 185
1 455
2 30
3 90
4 760
...
10881 150
10882 1155
10883 760
10884 30
10885 515
Name: last_update, Length: 10886, dtype: int64
# Turn the rating-count column from text into nullable integers.
# regex=False makes every replacement a literal match: ")" is not a valid
# regular expression on its own, and relying on the default triggered the
# FutureWarning about the changing `regex` default.
rating_counts = df['course_rating_amount']
rating_counts = rating_counts.str.replace(",", "", regex=False)
# NOTE(review): a single rating is mapped to 0 here, whereas "1 student" is
# mapped to 1 for the buyers column - confirm this is intended.
rating_counts = rating_counts.str.replace("1 rating", "0", regex=False)
rating_counts = rating_counts.str.replace(")", "", regex=False)
rating_counts = pd.to_numeric(rating_counts, downcast='integer')
df["course_rating_amount"] = rating_counts.astype('Int64')
df['course_rating_amount']
D:\Anaconda\Anaconda3\envs\mlcourse\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. This is separate from the ipykernel package so we can avoid doing imports until
0 108821
1 418238
2 36525
3 90083
4 115128
...
10881 609
10882 2138
10883 2
10884 14
10885 128
Name: course_rating_amount, Length: 10886, dtype: Int64
# Turn the buyers column from text into nullable integers: remove the
# thousands separators, then rewrite the "1 student" wording as the number 1.
buyer_counts = (
    df['num_of_buyers_students']
    .str.replace(",", "")
    .str.replace("1 student", "1")
)
buyer_counts = pd.to_numeric(buyer_counts, downcast='integer')
df["num_of_buyers_students"] = buyer_counts.astype('Int64')
df['num_of_buyers_students']
0 493315
1 1533407
2 150857
3 375779
4 542316
...
10881 3754
10882 16937
10883 1963
10884 68
10885 596
Name: num_of_buyers_students, Length: 10886, dtype: Int64
# Parse the price column as float64.
# downcast='float' is deliberately not used: it squeezes the values through
# float32, and widening them back to float64 leaves artefacts such as
# 69.900002 instead of 69.9.
df["course_price"] = pd.to_numeric(df["course_price"]).astype('float64')
df['course_price']
0 69.900002
1 69.900002
2 89.900002
3 89.900002
4 59.900002
...
10881 59.900002
10882 59.900002
10883 59.900002
10884 59.900002
10885 59.900002
Name: course_price, Length: 10886, dtype: float64
# Turn the video-length column from text into float64 hours.
raw_video_length = df['video_time_length']
# Remember which rows were given in minutes before the unit text is removed;
# the remaining rows are taken to be hours (assumption - the other visible
# values such as 30.5 or 1.5 are consistent with hours).
length_in_minutes = raw_video_length.str.contains(" mins on-demand video", regex=False, na=False)
video_length = raw_video_length.str.replace(" mins on-demand video", "", regex=False)
video_length = video_length.str.replace(" hour on-demand video", "", regex=False)
# No downcast='float': a float32 round-trip would add representation noise.
video_length = pd.to_numeric(video_length).astype('float64')
# Put minute values on the same scale as the rest instead of reading
# "35 mins" as 35 hours.
video_length = video_length.mask(length_in_minutes, video_length / 60)
df["video_time_length"] = video_length
df['video_time_length']
0 60.0
1 22.0
2 30.5
3 70.0
4 25.0
...
10881 5.0
10882 24.0
10883 1.5
10884 19.5
10885 10.0
Name: video_time_length, Length: 10886, dtype: float64
# Turn the instructor-rank column from text into float64; the "--" marker is
# stored as a rank of 0.
instructor_rank = df['instructor_rank'].str.replace("--", "0", regex=False)
# No downcast='float': going through float32 and back to float64 would turn
# values like 4.7 into 4.699999809.
df["instructor_rank"] = pd.to_numeric(instructor_rank).astype('float64')
df['instructor_rank']
0 4.7
1 4.6
2 4.7
3 4.5
4 4.6
...
10881 4.4
10882 4.6
10883 4.4
10884 3.4
10885 4.6
Name: instructor_rank, Length: 10886, dtype: float64
# Round every tracked column to two decimal places.
columns = ["best_seller", "course_rating_avarage", "course_price", "num_of_buyers_students", "amount_of_instructor_studnets", "num_articles", "video_time_length", "number_of_languages", "last_update", "instructor_rank", "amount_of_what_you_will_learn_count", "amount_of_requirments_count", "amount_of_companies_support", "course_rating_amount"]
rounded_columns = {name: df[name].round(2) for name in columns}
df = df.assign(**rounded_columns)
df
| best_seller | course_rating_avarage | course_price | num_of_buyers_students | amount_of_instructor_studnets | num_articles | video_time_length | number_of_languages | last_update | instructor_rank | amount_of_what_you_will_learn_count | amount_of_requirments_count | amount_of_companies_support | course_rating_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 4.7 | 69.9 | 493315 | 1415946 | 230 | 60.0 | 13 | 185 | 4.7 | 9 | 8 | 5 | 108821 |
| 1 | 0 | 4.6 | 69.9 | 1533407 | 2853435 | 14 | 22.0 | 9 | 455 | 4.6 | 12 | 6 | 5 | 418238 |
| 2 | 0 | 4.7 | 89.9 | 150857 | 201184 | 56 | 30.5 | 8 | 30 | 4.7 | 18 | 7 | 5 | 36525 |
| 3 | 0 | 4.6 | 89.9 | 375779 | 53 | 12 | 70.0 | 8 | 90 | 4.5 | 7 | 7 | 5 | 90083 |
| 4 | 1 | 4.6 | 59.9 | 542316 | 2853435 | 13 | 25.0 | 9 | 760 | 4.6 | 16 | 6 | 5 | 115128 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10881 | 0 | 4.3 | 59.9 | 3754 | 40786 | 1 | 5.0 | 2 | 150 | 4.4 | 7 | 6 | 5 | 609 |
| 10882 | 0 | 4.6 | 59.9 | 16937 | 68564 | 2 | 24.0 | 3 | 1155 | 4.6 | 4 | 7 | 5 | 2138 |
| 10883 | 0 | 4.5 | 59.9 | 1963 | 848521 | 14 | 1.5 | 2 | 760 | 4.4 | 6 | 5 | 0 | 2 |
| 10884 | 0 | 3.7 | 59.9 | 68 | 6057 | 1 | 19.5 | 2 | 30 | 3.4 | 23 | 6 | 0 | 14 |
| 10885 | 0 | 4.6 | 59.9 | 596 | 596 | 1 | 10.0 | 2 | 515 | 4.6 | 10 | 6 | 0 | 128 |
10886 rows × 14 columns
# Summary statistics for every column; include='all' also covers columns that
# are not numeric.
df.describe(include='all')
| best_seller | course_rating_avarage | course_price | num_of_buyers_students | amount_of_instructor_studnets | num_articles | video_time_length | number_of_languages | last_update | instructor_rank | amount_of_what_you_will_learn_count | amount_of_requirments_count | amount_of_companies_support | course_rating_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10886.000000 | 10886.000000 | 10886.000000 | 1.088600e+04 | 1.086400e+04 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 |
| mean | 0.078082 | 4.223278 | 64.202407 | 1.271581e+04 | 1.815136e+05 | 8.607110 | 12.058975 | 2.302958 | 517.665350 | 4.316213 | 8.482914 | 5.871027 | 1.335660 | 1573.600772 |
| std | 0.268313 | 0.807010 | 45.094440 | 4.390705e+04 | 3.641146e+05 | 18.917087 | 15.344214 | 1.321281 | 592.760663 | 0.469917 | 7.611141 | 0.605663 | 2.212411 | 10046.506905 |
| min | 0.000000 | 0.000000 | 53.900000 | 0.000000e+00 | 1.000000e+00 | 1.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 1.000000 | 5.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 4.100000 | 59.900000 | 2.660000e+02 | 3.822750e+03 | 1.000000 | 3.500000 | 2.000000 | 90.000000 | 4.200000 | 4.000000 | 6.000000 | 0.000000 | 22.000000 |
| 50% | 0.000000 | 4.400000 | 59.900000 | 1.743000e+03 | 2.570500e+04 | 3.000000 | 7.000000 | 2.000000 | 275.000000 | 4.400000 | 6.000000 | 6.000000 | 0.000000 | 81.000000 |
| 75% | 0.000000 | 4.600000 | 59.900000 | 9.132750e+03 | 1.566728e+05 | 8.000000 | 14.500000 | 2.000000 | 760.000000 | 4.600000 | 10.000000 | 6.000000 | 5.000000 | 412.000000 |
| max | 1.000000 | 5.000000 | 699.900000 | 1.533407e+06 | 2.885344e+06 | 333.000000 | 194.500000 | 16.000000 | 3710.000000 | 5.000000 | 175.000000 | 9.000000 | 5.000000 | 418238.000000 |
Markdown:
# handle outliers
# Blank out (set to NaN) every value that lies more than 1.5 * IQR outside the
# quartiles of its column, so that a later dropna() removes those rows.
#
# Only floating-point columns are treated. The earlier `dtype == int` test
# compared against a platform-dependent NumPy integer type and never matched
# the nullable Int64 columns, so whether integer columns were filtered
# depended on the machine. Integer columns are skipped on purpose: the 25% and
# 75% quantiles of the binary target `best_seller` are both 0, so filtering it
# would delete every best seller.
for col in df.columns:
    if not pd.api.types.is_float_dtype(df[col]):
        continue
    # Series.quantile ignores missing values; np.percentile would return NaN
    # and silently leave the column unfiltered.
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr_value = q3 - q1
    lower_bound = q1 - 1.5 * iqr_value
    upper_bound = q3 + 1.5 * iqr_value
    # between() is inclusive, so values exactly on a bound are kept.
    df[col] = df[col].where(df[col].between(lower_bound, upper_bound))
Markdown:
# Remove exact duplicate rows (keeping the first occurrence) and every row
# that still has a missing value in any column.
df = df.drop_duplicates(keep='first').dropna(how='any')
# Report the frame before renumbering, so the summary still shows which
# original row labels survived.
df.info()
# A non-inplace reset_index returns a fresh frame. Resetting in place kept the
# dropna() result, which pandas tracks as a possible copy, and later column
# assignments then raised SettingWithCopyWarning.
df = df.reset_index(drop=True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 5889 entries, 4 to 10885 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 best_seller 5889 non-null int64 1 course_rating_avarage 5889 non-null float64 2 course_price 5889 non-null float64 3 num_of_buyers_students 5889 non-null Int64 4 amount_of_instructor_studnets 5889 non-null Int64 5 num_articles 5889 non-null int64 6 video_time_length 5889 non-null float64 7 number_of_languages 5889 non-null int64 8 last_update 5889 non-null int64 9 instructor_rank 5889 non-null float64 10 amount_of_what_you_will_learn_count 5889 non-null int64 11 amount_of_requirments_count 5889 non-null int64 12 amount_of_companies_support 5889 non-null int64 13 course_rating_amount 5889 non-null Int64 dtypes: Int64(3), float64(4), int64(7) memory usage: 707.4 KB
# Persist the cleaned table and read it straight back as a sanity check.
columns = ["best_seller", "course_rating_avarage", "course_price", "num_of_buyers_students", "amount_of_instructor_studnets", "num_articles", "video_time_length", "number_of_languages", "last_update", "instructor_rank", "amount_of_what_you_will_learn_count", "amount_of_requirments_count", "amount_of_companies_support", "course_rating_amount"]
file_name = 'UDEMY_DATA_after_cleaning.csv'
# The row index is not written; the header row uses the names listed above.
df.to_csv(file_name, index=False, header=columns)
pok = pd.read_csv(file_name, sep=',', header=0)
pok.head()
| best_seller | course_rating_avarage | course_price | num_of_buyers_students | amount_of_instructor_studnets | num_articles | video_time_length | number_of_languages | last_update | instructor_rank | amount_of_what_you_will_learn_count | amount_of_requirments_count | amount_of_companies_support | course_rating_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 4.6 | 59.9 | 542316 | 2853435 | 13 | 25.0 | 9 | 760 | 4.6 | 16 | 6 | 5 | 115128 |
| 1 | 0 | 4.6 | 59.9 | 136543 | 59799 | 4 | 9.0 | 8 | 425 | 4.7 | 8 | 7 | 5 | 40180 |
| 2 | 0 | 4.5 | 59.9 | 224747 | 547988 | 1 | 13.5 | 2 | 30 | 4.5 | 18 | 7 | 5 | 20177 |
| 3 | 0 | 4.5 | 59.9 | 93333 | 704228 | 1 | 2.5 | 7 | 30 | 4.5 | 1 | 6 | 5 | 26088 |
| 4 | 0 | 4.6 | 59.9 | 10352 | 395969 | 39 | 20.0 | 2 | 60 | 4.5 | 7 | 6 | 5 | 478 |
# Counts of Best Sellers
# Donut chart showing the share of each best_seller value.
fig = px.pie(
    df,
    names="best_seller",
    title="<b>Counts in best_seller</b>",
    color_discrete_sequence=px.colors.sequential.Blackbody_r,
    hole=0.5,
)
# Black outline drawn around every slice.
slice_outline = dict(color='#000000', width=1.5)
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    textfont_size=15,
    marker=dict(line=slice_outline),
)
# Centre the title and enlarge it.
fig.update_layout(title_x=0.5, title_font=dict(size=20))
fig.show()
markdown:
# Heatmap
# Pairwise correlations between the columns, drawn on a 10x7 inch figure.
plt.figure(figsize=(10, 7))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix)
<AxesSubplot:>
# Bar chart: per best_seller value, the maximum of the buyers column and of
# the instructor-students column.
max_audience = df.groupby('best_seller').agg(
    {"num_of_buyers_students": 'max', "amount_of_instructor_studnets": 'max'}
)
max_audience.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Maximum Number of Buyers & Amount of Instructors\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.grid()
plt.legend(loc='upper left')
plt.show()
Markdown:
# Bar chart: per best_seller value, the maximum video length and the maximum
# number of languages.
max_content = df.groupby('best_seller').agg(
    {'video_time_length': 'max', 'number_of_languages': 'max'}
)
max_content.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Counts Video time length & Number of languages\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.grid()
plt.show()
Markdown:
# Bar chart: per best_seller value, the mean rating count and the mean number
# of buyers.
mean_popularity = df.groupby('best_seller').agg(
    {'course_rating_amount': 'mean', 'num_of_buyers_students': 'mean'}
)
mean_popularity.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Average Course rating amount & Number of Buyer students\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Mean", size=12)
plt.grid()
plt.show()
# Bar chart: per best_seller value, the mean video length and the mean number
# of languages.
mean_content = df.groupby('best_seller').agg(
    {'video_time_length': 'mean', 'number_of_languages': 'mean'}
)
mean_content.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Average Video time length & Number of languages\n", size=15)
plt.grid()
plt.xlabel("Best sellers", size=12)
# The bars are means, so the axis must not be labelled "Maximum Count".
plt.ylabel("Mean", size=12)
plt.show()
Markdown:
# Bar chart: per best_seller value, the maximum number of buyers.
max_buyers = df.groupby('best_seller').agg({'num_of_buyers_students': 'max'})
max_buyers.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Number of buyer students\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.grid()
plt.show()
Markdown:
# Bar chart: per best_seller value, the maximum number of languages.
max_languages = df.groupby('best_seller').agg({'number_of_languages': 'max'})
max_languages.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Maximum Number of languages\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Maximum Count", size=12)
plt.grid()
plt.show()
# Bar chart: per best_seller value, the mean course rating and the minimum
# instructor rank.
rating_and_rank = df.groupby('best_seller').agg(
    {'course_rating_avarage': 'mean', 'instructor_rank': 'min'}
)
rating_and_rank.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Average Course rating & Minimum Instructor Rank\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Mean Count", size=12)
plt.grid()
plt.show()
Markdown:
# Bar chart: per best_seller value, the mean number of articles and the mean
# video length.
mean_material = df.groupby('best_seller').agg(
    {'num_articles': 'mean', 'video_time_length': 'mean'}
)
mean_material.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Average Number of articles & Video time length\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Mean Count", size=12)
plt.grid()
plt.show()
Markdown:
# Bar chart: per best_seller value, the mean course rating and the mean
# companies-support value.
rating_and_support = df.groupby('best_seller').agg(
    {'course_rating_avarage': 'mean', 'amount_of_companies_support': 'mean'}
)
rating_and_support.plot(kind='bar', rot=1, figsize=(12, 5))
plt.title("Visualizing Average Course Rating & Amount of Companies supports\n", size=15)
plt.xlabel("Best sellers", size=12)
plt.ylabel("Count", size=12)
plt.grid()
plt.show()
# List every pair of feature columns whose correlation is at least 0.5 and
# print the pairs from weakest to strongest.
#
# The correlation matrix is computed once; recomputing df.corr() in every
# loop test repeated the same full computation hundreds of times.
corr_matrix = df.corr()
corr_values = corr_matrix.values
# Take the names from the matrix itself so that positions and labels cannot
# drift apart if a column is missing from the correlation result.
cols_for_correlations = list(corr_matrix.columns)

correlations = []
tuple_arr = []
n_columns = corr_values.shape[0]
# Start at 1 to skip position 0 (`best_seller`, the first column); j > i
# visits each unordered pair exactly once.
for i in range(1, n_columns):
    for j in range(i + 1, n_columns):
        if corr_values[i][j] >= 0.5:
            correlations.append(corr_values[i][j])
            tuple_arr.append((i, j))
print(correlations)
print(tuple_arr)

indx_sort = np.argsort(correlations)
for n_correlation in indx_sort:
    col_lt, col_rt = tuple_arr[n_correlation]
    col_name_lt, col_name_rt = cols_for_correlations[col_lt], cols_for_correlations[col_rt]
    title = "corr('%s', '%s')=%4.2f" % (col_name_lt, col_name_rt, correlations[n_correlation])
    print(title)
[0.5872909486255095, 0.6704233870682552, 0.5764588553470059]
[(1, 9), (3, 13), (7, 13)]
corr('number_of_languages', 'course_rating_amount')=0.58
corr('course_rating_avarage', 'instructor_rank')=0.59
corr('num_of_buyers_students', 'course_rating_amount')=0.67
Markdown:
# very low - very high
# Replace the raw buyers count with an ordinal level from 1 to RANGE, using
# RANGE equal-width bins that span 0 .. max.
RANGE = 5
labels = list(range(1, RANGE + 1))
max_buyers = df['num_of_buyers_students'].max()
divider_value = max_buyers / RANGE
# Lower edges are floored to whole numbers; the last edge is the exact max.
bins = [math.floor(divider_value * step) for step in range(RANGE)]
bins.append(max_buyers)
# include_lowest=True keeps a count of exactly 0 in the first bin; with the
# default half-open (0, edge] interval it would turn into NaN.
df['num_of_buyers_students'] = pd.cut(df['num_of_buyers_students'], bins=bins, labels=labels, include_lowest=True)
D:\Anaconda\Anaconda3\envs\mlcourse\lib\site-packages\ipykernel_launcher.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# very low, low, average, high, very high
# 1,        2,   3,       4,    5
# Replace the raw instructor-students count with an ordinal level from 1 to
# RANGE, using RANGE equal-width bins that span 0 .. max.
RANGE = 5
labels = list(range(1, RANGE + 1))
max_instructor_students = df['amount_of_instructor_studnets'].max()
divider_value = max_instructor_students / RANGE
# Lower edges are floored to whole numbers; the last edge is the exact max.
bins = [math.floor(divider_value * step) for step in range(RANGE)]
bins.append(max_instructor_students)
# include_lowest=True keeps a value equal to the first edge (0) in the first
# bin; with the default half-open interval it would turn into NaN.
df['amount_of_instructor_studnets'] = pd.cut(df['amount_of_instructor_studnets'], bins=bins, labels=labels, include_lowest=True)
D:\Anaconda\Anaconda3\envs\mlcourse\lib\site-packages\ipykernel_launcher.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Train and evaluate a 3-nearest-neighbours classifier for `best_seller`.
df1 = df.copy()
# Optional experiment: drop 'course_rating_avarage', 'last_update' and
# 'course_rating_amount' from df1 here before building X.
# The column mask is built from df1 itself; taking it from `df` only works
# while both frames have exactly the same columns.
X = df1.loc[:, df1.columns != 'best_seller']
y = df1['best_seller']
# Hold out 20% of the rows; the same split is reused by the later models.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=42, test_size=0.2)
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(XTrain, yTrain)
y_pred=clf.predict(XTest)
print('confusion matrix:\n ',metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))
# NOTE: the accuracy is the mean of a 10-fold cross-validation on the full
# data, while the metrics below are measured on the held-out split.
scores = cross_val_score(clf, X, y, cv=10)
print("Accuracy: %0.2f" % scores.mean())
print("Recall: %0.2f" % metrics.recall_score(y_true = yTest, y_pred = y_pred))
print("Precision: %0.2f" % metrics.precision_score(y_true = yTest, y_pred = y_pred))
print("F1: %0.2f" % metrics.f1_score(y_true = yTest, y_pred = y_pred))
confusion matrix: [[1068 28] [ 54 28]] Accuracy: 0.93 Recall: 0.34 Precision: 0.50 F1: 0.41
# Train a decision tree on the shared train split and report its metrics on
# the held-out test split.
# random_state is fixed (matching the seed of the train/test split) so that
# repeated runs of the notebook print the same numbers.
decisionTree = tree.DecisionTreeClassifier(random_state=42)
decisionTree = decisionTree.fit(XTrain, yTrain)
y_pred = decisionTree.predict(XTest)
print('confusion matrix:\n ',metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))
print('Accuracy: ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))
print("Recall: %0.2f" % metrics.recall_score(y_true = yTest, y_pred = y_pred))
print("Precision: %0.2f" % metrics.precision_score(y_true = yTest, y_pred = y_pred))
print("F1: %0.2f" % metrics.f1_score(y_true = yTest, y_pred = y_pred))
confusion matrix: [[1052 44] [ 38 44]] Accuracy: 0.9303904923599321 Recall: 0.54 Precision: 0.50 F1: 0.52
# Train a Gaussian naive Bayes model on the shared train split and report its
# metrics on the held-out test split.
gnb = GaussianNB()
gnb.fit(XTrain, yTrain)
y_pred = gnb.predict(XTest)

nb_confusion = metrics.confusion_matrix(y_true=yTest, y_pred=y_pred)
nb_accuracy = metrics.accuracy_score(y_true=yTest, y_pred=y_pred)
nb_recall = metrics.recall_score(y_true=yTest, y_pred=y_pred)
nb_precision = metrics.precision_score(y_true=yTest, y_pred=y_pred)
nb_f1 = metrics.f1_score(y_true=yTest, y_pred=y_pred)

print('confusion matrix:\n ', nb_confusion)
print('Accuracy: ', nb_accuracy)
print("Recall: %0.2f" % nb_recall)
print("Precision: %0.2f" % nb_precision)
print("F1: %0.2f" % nb_f1)
confusion matrix: [[1035 61] [ 64 18]] Accuracy: 0.8938879456706282 Recall: 0.22 Precision: 0.23 F1: 0.22
Best sellers have a higher number of supporting companies compared to non-best sellers.
So, if you would like your course to succeed and get more exposure, we recommend creating a lot of videos, teaching the course in at least a few languages, and planning everything before publishing. As we can see, for a best seller it is not that important to update the course from time to time, but you should aim for as high a rating as you can from the start.